library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyverse)
Warning: package ‘tidyverse’ was built under R version 4.2.3Warning: package ‘ggplot2’ was built under R version 4.2.3Warning: package ‘tidyr’ was built under R version 4.2.3Warning: package ‘readr’ was built under R version 4.2.3Warning: package ‘purrr’ was built under R version 4.2.3Warning: package ‘forcats’ was built under R version 4.2.3Warning: package ‘lubridate’ was built under R version 4.2.3── Attaching core tidyverse packages ──────────────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ readr 2.1.4
✔ ggplot2 3.4.3 ✔ stringr 1.5.0
✔ lubridate 1.9.2 ✔ tibble 3.1.8
✔ purrr 1.0.2 ✔ tidyr 1.3.0── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(tidyr)
library(corrplot)
Warning: package ‘corrplot’ was built under R version 4.2.3corrplot 0.92 loaded
library(plotly)
Warning: package ‘plotly’ was built under R version 4.2.3Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(caTools)
Warning: package ‘caTools’ was built under R version 4.2.3
library(car)
Warning: package ‘car’ was built under R version 4.2.3Loading required package: carData
Warning: package ‘carData’ was built under R version 4.2.3
Attaching package: ‘car’
The following object is masked from ‘package:purrr’:
some
The following object is masked from ‘package:dplyr’:
recode
Mat = read.csv('studentMat.csv')
non_numeric_columns <- names(Mat)[sapply(Mat, function(col) !is.numeric(col))]
cat("These are non-numeric columns:", non_numeric_columns, "\n\n")
These are non-numeric columns: school sex address famsize Pstatus Mjob Fjob reason guardian schoolsup famsup paid activities nursery higher internet romantic
numeric_columns <- names(Mat)[sapply(Mat, is.numeric)]
cat("These are numeric columns:", numeric_columns, "\n")
These are numeric columns: age Medu Fedu traveltime studytime failures famrel freetime goout Dalc Walc health absences G1 G2 G3
We have to convert Non numeric data to numeric data to find the correlation coeff
# Mat$sex <- ifelse(Mat$sex == 'M', 1, 0)
# non_numeric_columns <- c('school', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic')
# for(col in non_numeric_columns) {
# if(length(unique(Mat[[col]])) == 2) {
# # Label Encoding
# levels <- unique(Mat[[col]])
# Mat[[col]] <- ifelse(Mat[[col]] == levels[1], 1, 0)
# } else {
# # One-Hot Encoding
# Mat <- Mat %>%
# mutate(!!col := as.factor(!!sym(col))) %>%
# spread(!!col, !!col, fill = 0, convert = TRUE)
# }
# }
non_numeric <- names(Mat)[sapply(Mat, function(col) !is.numeric(col))]
for (col in non_numeric){
print(paste("The column ", col, "has ", length(unique(Mat[[col]])), ' unique values they are: '))
print((unique(Mat[[col]])))
cat('\n')
}
[1] "The column school has 2 unique values they are: "
[1] "GP" "MS"
[1] "The column sex has 2 unique values they are: "
[1] "F" "M"
[1] "The column address has 2 unique values they are: "
[1] "U" "R"
[1] "The column famsize has 2 unique values they are: "
[1] "GT3" "LE3"
[1] "The column Pstatus has 2 unique values they are: "
[1] "A" "T"
[1] "The column Mjob has 5 unique values they are: "
[1] "at_home" "health" "other" "services" "teacher"
[1] "The column Fjob has 5 unique values they are: "
[1] "teacher" "other" "services" "health" "at_home"
[1] "The column reason has 4 unique values they are: "
[1] "course" "other" "home" "reputation"
[1] "The column guardian has 3 unique values they are: "
[1] "mother" "father" "other"
[1] "The column schoolsup has 2 unique values they are: "
[1] "yes" "no"
[1] "The column famsup has 2 unique values they are: "
[1] "no" "yes"
[1] "The column paid has 2 unique values they are: "
[1] "no" "yes"
[1] "The column activities has 2 unique values they are: "
[1] "no" "yes"
[1] "The column nursery has 2 unique values they are: "
[1] "yes" "no"
[1] "The column higher has 2 unique values they are: "
[1] "yes" "no"
[1] "The column internet has 2 unique values they are: "
[1] "no" "yes"
[1] "The column romantic has 2 unique values they are: "
[1] "no" "yes"
binary_columns <- c()
multi_unique_columns <- c()
# Loop through each column and categorize based on the number of unique values
for (col in non_numeric) {
num_unique <- length(unique(Mat[[col]]))
if (num_unique == 2) {
binary_columns <- c(binary_columns, col)
} else if (num_unique > 2) {
multi_unique_columns <- c(multi_unique_columns, col)
}
}
# for (col in binary_columns) {
# unique_vals <- unique(Mat[[col]])
# Mat[[col]] <- ifelse(Mat[[col]] == unique_vals[1], 0, 1)
# }
print(paste("Binary columns:", paste(binary_columns, collapse = ", ")))
[1] "Binary columns: school, sex, address, famsize, Pstatus, schoolsup, famsup, paid, activities, nursery, higher, internet, romantic"
cat('\n')
print(paste("Multi unique value columns:", paste(multi_unique_columns, collapse = ", ")))
[1] "Multi unique value columns: Mjob, Fjob, reason, guardian"
for (col in binary_columns) {
if ("yes" %in% Mat[[col]] && "no" %in% Mat[[col]]) {
# If the column has 'yes' and 'no' values
Mat[[col]] <- ifelse(Mat[[col]] == "yes", 1, 0)
} else {
# For other binary columns
unique_vals <- unique(Mat[[col]])
Mat[[col]] <- ifelse(Mat[[col]] == unique_vals[1], 0, 1)
print(paste(unique_vals[1],':0', unique_vals[2],':1'))
}
}
[1] "GP :0 MS :1"
[1] "F :0 M :1"
[1] "U :0 R :1"
[1] "GT3 :0 LE3 :1"
[1] "A :0 T :1"
# Perform one-hot encoding for each column in multi_unique_columns
for (col in multi_unique_columns) {
# Create a one-hot encoded matrix for the column
formula_str <- paste("~ 0 +", col)
one_hot <- model.matrix(as.formula(formula_str), data = Mat)
# Convert matrix to data frame and set column names
one_hot_df <- as.data.frame(one_hot)
colnames(one_hot_df) <- gsub("^.\\.", col, colnames(one_hot_df))
# Bind the new one-hot encoded columns to the original data frame
Mat <- cbind(Mat, one_hot_df)
# Remove the original column
Mat[[col]] <- NULL
}
# View the first few rows of the processed data to verify
head(Mat)
NA
cor_matrix <- cor(Mat, use = "complete.obs")
corrplot(cor_matrix, method = "circle")
plot_ly(
x = colnames(cor_matrix),
y = rownames(cor_matrix),
z = cor_matrix,
type = "heatmap",
colorscale = "Viridis"
) %>%
layout(title_text = "Correlation Matrix Heatmap")
Warning: 'layout' objects don't have these attributes: 'title_text'
Valid attributes include:
'_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
Warning: 'layout' objects don't have these attributes: 'title_text'
Valid attributes include:
'_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
G3_correlation <- cor_matrix[,'G3']
library(plotly)
plot_ly(x = names(G3_correlation), y = G3_correlation, type = 'bar') %>%
layout(title = "Correlation of G3 with Other Variables", yaxis = list(title = "Correlation Coefficient"))
NA
print(G3_correlation)
school sex age address famsize
-0.04501694 0.10345565 -0.16157944 -0.10575606 0.08140711
Pstatus Medu Fedu traveltime studytime
-0.05800898 0.21714750 0.15245694 -0.11714205 0.09781969
failures schoolsup famsup paid activities
-0.36041494 -0.08278821 -0.03915715 0.10199624 0.01609970
nursery higher internet romantic famrel
0.05156790 0.18246462 0.09848337 -0.12996995 0.05136343
freetime goout Dalc Walc health
0.01130724 -0.13279147 -0.05466004 -0.05193932 -0.06133460
absences G1 G2 G3 Mjobat_home
0.03424732 0.80146793 0.90486799 1.00000000 -0.11563430
Mjobhealth Mjobother Mjobservices Mjobteacher Fjobat_home
0.11615799 -0.09647737 0.07842887 0.05771238 -0.01338457
Fjobhealth Fjobother Fjobservices Fjobteacher reasoncourse
0.05711055 -0.05348341 -0.01610783 0.09537401 -0.09894964
reasonhome reasonother reasonreputation guardianfather guardianmother
-0.02135917 0.05200772 0.09569223 0.03249322 0.02233775
guardianother
-0.08777445
# Sorting correlations in decreasing order to get top positive correlations
top_positive <- sort(G3_correlation, decreasing = TRUE)[1:10]
# Sorting correlations in increasing order to get top negative correlations
top_negative <- sort(G3_correlation)[1:10]
print("Top Positive Correlations:")
[1] "Top Positive Correlations:"
print(top_positive)
G3 G2 G1 Medu higher Fedu Mjobhealth sex
1.00000000 0.90486799 0.80146793 0.21714750 0.18246462 0.15245694 0.11615799 0.10345565
paid internet
0.10199624 0.09848337
print("Top Negative Correlations:")
[1] "Top Negative Correlations:"
print(top_negative)
failures age goout romantic traveltime Mjobat_home
-0.36041494 -0.16157944 -0.13279147 -0.12996995 -0.11714205 -0.11563430
address reasoncourse Mjobother guardianother
-0.10575606 -0.09894964 -0.09647737 -0.08777445
# Sort G3_correlation in increasing order
sorted_names <- names(sort(G3_correlation))
# Convert variable names to a factor with levels specified by the sorted order
factor_names <- factor(names(G3_correlation), levels = sorted_names)
# Plot using plotly
plot_ly(x = factor_names, y = G3_correlation, type = 'bar') %>%
layout(title = "Correlation of G3 with Other Variables in Increasing Order",
yaxis = list(title = "Correlation Coefficient"))
NA
NA
Mat_alc <- Mat
Mat_alc$Avg_alc <- (Mat_alc$Dalc + Mat_alc$Walc)/2
Mat_alc[,c("Avg_alc", "Dalc", "Walc")]
Mat_alc <- Mat_alc %>% select(-Dalc, -Walc)
Mat_alc
Mat_alc <- Mat_alc %>% select(-G1, -G2,-G3)
corr_mat = cor(Mat_alc)
corrplot(corr_mat)
Avg_alc_correlation <- corr_mat[,'Avg_alc']
plot_ly(x = names(Avg_alc_correlation), y = Avg_alc_correlation, type = 'bar') %>%
layout(title = "Correlation of Avg_alc with Other Variables", yaxis = list(title = "Correlation Coefficient"))
sorted_names <- names(sort(Avg_alc_correlation))
factor_names <- factor(names(Avg_alc_correlation), levels = sorted_names)
plot_ly(x = factor_names, y = Avg_alc_correlation, type = 'bar') %>%
layout(title = "Correlation of Avg_alc with Other Variables in Increasing Order",
yaxis = list(title = "Correlation Coefficient"))
NA
Avg_alc_correlation
school sex age address famsize
0.0935079933 0.2983306751 0.1349722737 0.1075991608 0.1126945704
Pstatus Medu Fedu traveltime studytime
-0.0098078682 -0.0216807701 -0.0071265013 0.1491336864 -0.2526978701
failures schoolsup famsup paid activities
0.1532033260 -0.0662068126 -0.0704349862 0.0672743608 -0.0541766778
nursery higher internet romantic famrel
-0.1026850313 -0.0964658432 0.0238390654 0.0002056756 -0.1084265610
freetime goout health absences Mjobat_home
0.1897535472 0.3926829382 0.0946623616 0.1386874792 -0.0091633057
Mjobhealth Mjobother Mjobservices Mjobteacher Fjobat_home
-0.0193705820 -0.0076611840 0.0100910644 0.0224310187 -0.0899757864
Fjobhealth Fjobother Fjobservices Fjobteacher reasoncourse
-0.0606312159 -0.0014299118 0.1088181083 -0.0606931394 0.0080510716
reasonhome reasonother reasonreputation guardianfather guardianmother
0.0138185842 0.1296121075 -0.1072067611 0.0289511097 -0.0187935792
guardianother Avg_alc
-0.0126818299 1.0000000000
Mat_alc
# Assuming your data is in a variable named 'Mat_alc'
# 2. Split the data into training and testing sets
set.seed(123) # Setting seed for reproducibility
split = sample.split(Mat_alc$Avg_alc, SplitRatio = 0.8)
train_data = subset(Mat_alc, split == TRUE)
test_data = subset(Mat_alc, split == FALSE)
# 3. Train a linear regression model using the training set
model <- lm(Avg_alc ~ ., data = train_data) # The dot means we are using all other columns as predictors
# 4. Evaluate the model using the testing set
predictions = predict(model, newdata = test_data)
Warning: prediction from a rank-deficient fit may be misleading
mse = mean((predictions - test_data$Avg_alc)^2) # Mean Squared Error
print(mse)
[1] 0.7233548
# Additionally, you can print a summary of the model to inspect coefficients and other statistics
print(summary(model))
Call:
lm(formula = Avg_alc ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-2.07721 -0.52527 -0.05375 0.41627 2.98475
Coefficients: (4 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.322e-01 1.162e+00 0.544 0.586784
school -1.641e-01 1.930e-01 -0.850 0.396053
sex 3.533e-01 1.088e-01 3.248 0.001303 **
age 1.663e-02 5.030e-02 0.331 0.741126
address 3.879e-01 1.299e-01 2.986 0.003076 **
famsize 1.463e-01 1.088e-01 1.345 0.179763
Pstatus -1.937e-01 1.590e-01 -1.218 0.224242
Medu -4.710e-02 7.168e-02 -0.657 0.511704
Fedu 6.931e-02 6.231e-02 1.112 0.266903
traveltime 1.332e-01 7.649e-02 1.741 0.082738 .
studytime -1.043e-01 6.487e-02 -1.608 0.109058
failures 2.616e-05 7.465e-02 0.000 0.999721
schoolsup -1.563e-01 1.523e-01 -1.026 0.305609
famsup -1.549e-02 1.069e-01 -0.145 0.884877
paid 1.886e-01 1.057e-01 1.784 0.075578 .
activities -2.104e-01 9.900e-02 -2.125 0.034478 *
nursery -2.773e-01 1.221e-01 -2.271 0.023906 *
higher 9.098e-02 2.474e-01 0.368 0.713402
internet 4.596e-02 1.395e-01 0.329 0.742087
romantic -1.356e-01 1.073e-01 -1.263 0.207541
famrel -2.024e-01 5.247e-02 -3.858 0.000142 ***
freetime 6.878e-02 5.369e-02 1.281 0.201243
goout 3.368e-01 4.647e-02 7.248 4.19e-12 ***
health 6.786e-02 3.625e-02 1.872 0.062276 .
absences 9.089e-03 6.263e-03 1.451 0.147860
Mjobat_home -4.918e-02 2.409e-01 -0.204 0.838389
Mjobhealth -1.625e-01 2.114e-01 -0.769 0.442661
Mjobother 4.914e-03 1.876e-01 0.026 0.979118
Mjobservices -1.105e-01 1.729e-01 -0.639 0.523327
Mjobteacher NA NA NA NA
Fjobat_home 2.854e-02 3.249e-01 0.088 0.930063
Fjobhealth 1.710e-01 2.960e-01 0.578 0.564066
Fjobother 1.854e-01 2.118e-01 0.876 0.381918
Fjobservices 4.603e-01 2.175e-01 2.116 0.035209 *
Fjobteacher NA NA NA NA
reasoncourse -3.971e-02 1.275e-01 -0.312 0.755613
reasonhome 5.426e-02 1.327e-01 0.409 0.682887
reasonother 1.970e-01 1.960e-01 1.005 0.315694
reasonreputation NA NA NA NA
guardianfather 1.541e-01 2.236e-01 0.689 0.491250
guardianmother 3.250e-02 2.086e-01 0.156 0.876331
guardianother NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8229 on 278 degrees of freedom
Multiple R-squared: 0.3871, Adjusted R-squared: 0.3055
F-statistic: 4.745 on 37 and 278 DF, p-value: 1.385e-14
# Removing redudant variables for 1-hot encoding
Mat_alc <- Mat_alc %>% select(-guardianother, -reasonreputation, -Fjobteacher, -Mjobteacher)
set.seed(123)
split = sample.split(Mat_alc$Avg_alc, SplitRatio = 0.8)
train_data = subset(Mat_alc, split == TRUE)
test_data = subset(Mat_alc, split == FALSE)
model <- lm(Avg_alc ~ ., data = train_data)
predictions = predict(model, newdata = test_data)
mse = mean((predictions - test_data$Avg_alc)^2)
print(mse)
[1] 0.7233548
print(summary(model))
Call:
lm(formula = Avg_alc ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-2.07721 -0.52527 -0.05375 0.41627 2.98475
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.322e-01 1.162e+00 0.544 0.586784
school -1.641e-01 1.930e-01 -0.850 0.396053
sex 3.533e-01 1.088e-01 3.248 0.001303 **
age 1.663e-02 5.030e-02 0.331 0.741126
address 3.879e-01 1.299e-01 2.986 0.003076 **
famsize 1.463e-01 1.088e-01 1.345 0.179763
Pstatus -1.937e-01 1.590e-01 -1.218 0.224242
Medu -4.710e-02 7.168e-02 -0.657 0.511704
Fedu 6.931e-02 6.231e-02 1.112 0.266903
traveltime 1.332e-01 7.649e-02 1.741 0.082738 .
studytime -1.043e-01 6.487e-02 -1.608 0.109058
failures 2.616e-05 7.465e-02 0.000 0.999721
schoolsup -1.563e-01 1.523e-01 -1.026 0.305609
famsup -1.549e-02 1.069e-01 -0.145 0.884877
paid 1.886e-01 1.057e-01 1.784 0.075578 .
activities -2.104e-01 9.900e-02 -2.125 0.034478 *
nursery -2.773e-01 1.221e-01 -2.271 0.023906 *
higher 9.098e-02 2.474e-01 0.368 0.713402
internet 4.596e-02 1.395e-01 0.329 0.742087
romantic -1.356e-01 1.073e-01 -1.263 0.207541
famrel -2.024e-01 5.247e-02 -3.858 0.000142 ***
freetime 6.878e-02 5.369e-02 1.281 0.201243
goout 3.368e-01 4.647e-02 7.248 4.19e-12 ***
health 6.786e-02 3.625e-02 1.872 0.062276 .
absences 9.089e-03 6.263e-03 1.451 0.147860
Mjobat_home -4.918e-02 2.409e-01 -0.204 0.838389
Mjobhealth -1.625e-01 2.114e-01 -0.769 0.442661
Mjobother 4.914e-03 1.876e-01 0.026 0.979118
Mjobservices -1.105e-01 1.729e-01 -0.639 0.523327
Fjobat_home 2.854e-02 3.249e-01 0.088 0.930063
Fjobhealth 1.710e-01 2.960e-01 0.578 0.564066
Fjobother 1.854e-01 2.118e-01 0.876 0.381918
Fjobservices 4.603e-01 2.175e-01 2.116 0.035209 *
reasoncourse -3.971e-02 1.275e-01 -0.312 0.755613
reasonhome 5.426e-02 1.327e-01 0.409 0.682887
reasonother 1.970e-01 1.960e-01 1.005 0.315694
guardianfather 1.541e-01 2.236e-01 0.689 0.491250
guardianmother 3.250e-02 2.086e-01 0.156 0.876331
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8229 on 278 degrees of freedom
Multiple R-squared: 0.3871, Adjusted R-squared: 0.3055
F-statistic: 4.745 on 37 and 278 DF, p-value: 1.385e-14
vif_model <- vif(model)
print(vif_model)
school sex age address famsize
1.538024 1.375623 1.810255 1.385124 1.161434
Pstatus Medu Fedu traveltime studytime
1.161829 2.862306 2.195068 1.312992 1.357516
failures schoolsup famsup paid activities
1.454386 1.221372 1.255041 1.296218 1.142498
nursery higher internet romantic famrel
1.161971 1.291811 1.248884 1.181200 1.101477
freetime goout health absences Mjobat_home
1.309819 1.247316 1.177664 1.270030 3.183240
Mjobhealth Mjobother Mjobservices Fjobat_home Fjobhealth
1.791567 3.799501 2.702002 1.799030 1.848736
Fjobother Fjobservices reasoncourse reasonhome reasonother
5.155180 4.496349 1.755050 1.627323 1.400665
guardianfather guardianmother
4.182718 4.393150
vif_model[vif_model >= 5]
Fjobother
5.15518
Dropping Fjobother because the VIF value > 5 Generally, * 1 = not correlated. * Between 1 and 5 = moderately correlated. * Greater than 5 = highly correlated.
vif_model[vif_model > 3 & vif_model < 5]
Mjobat_home Mjobother Fjobservices guardianfather guardianmother
3.183240 3.799501 4.496349 4.182718 4.393150
vif_model[vif_model <=1]
named numeric(0)
Mat_alc <- Mat_alc %>% select(-Fjobother)
set.seed(123)
split = sample.split(Mat_alc$Avg_alc, SplitRatio = 0.8)
train_data = subset(Mat_alc, split == TRUE)
test_data = subset(Mat_alc, split == FALSE)
model <- lm(Avg_alc ~ ., data = train_data)
predictions = predict(model, newdata = test_data)
mse = mean((predictions - test_data$Avg_alc)^2)
print(mse)
[1] 0.740984
print(summary(model))
Call:
lm(formula = Avg_alc ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-2.05284 -0.51987 -0.07558 0.40831 2.99398
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.842181 1.136370 0.741 0.459247
school -0.156950 0.192762 -0.814 0.416215
sex 0.353346 0.108724 3.250 0.001296 **
age 0.015411 0.050257 0.307 0.759343
address 0.392068 0.129753 3.022 0.002747 **
famsize 0.148944 0.108718 1.370 0.171790
Pstatus -0.197511 0.158871 -1.243 0.214832
Medu -0.046960 0.071652 -0.655 0.512752
Fedu 0.055838 0.060352 0.925 0.355656
traveltime 0.129915 0.076367 1.701 0.090023 .
studytime -0.104700 0.064842 -1.615 0.107503
failures -0.003079 0.074530 -0.041 0.967081
schoolsup -0.169465 0.151445 -1.119 0.264111
famsup -0.012351 0.106756 -0.116 0.907979
paid 0.195821 0.105348 1.859 0.064107 .
activities -0.203463 0.098642 -2.063 0.040071 *
nursery -0.280195 0.121990 -2.297 0.022367 *
higher 0.082193 0.247138 0.333 0.739703
internet 0.055174 0.139074 0.397 0.691874
romantic -0.143141 0.106950 -1.338 0.181858
famrel -0.201111 0.052425 -3.836 0.000155 ***
freetime 0.065689 0.053551 1.227 0.220985
goout 0.340353 0.046276 7.355 2.14e-12 ***
health 0.067706 0.036236 1.868 0.062741 .
absences 0.009262 0.006258 1.480 0.139982
Mjobat_home -0.026317 0.239376 -0.110 0.912537
Mjobhealth -0.133108 0.208617 -0.638 0.523963
Mjobother 0.035265 0.184246 0.191 0.848350
Mjobservices -0.092509 0.171619 -0.539 0.590293
Fjobat_home -0.137286 0.263865 -0.520 0.603274
Fjobhealth 0.026263 0.245520 0.107 0.914890
Fjobservices 0.300141 0.117694 2.550 0.011301 *
reasoncourse -0.039207 0.127415 -0.308 0.758529
reasonhome 0.056190 0.132614 0.424 0.672107
reasonother 0.170396 0.193544 0.880 0.379402
guardianfather 0.133871 0.222282 0.602 0.547492
guardianmother 0.018863 0.207958 0.091 0.927793
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8226 on 279 degrees of freedom
Multiple R-squared: 0.3854, Adjusted R-squared: 0.3061
F-statistic: 4.86 on 36 and 279 DF, p-value: 8.419e-15
Important Features based on the Significance codes sex, address, traveltime, paid, activities, nursery, famrel, goout, health, Fjobservices
model_summary <- summary(model)
estimates <- model_summary$coefficients[, "Estimate"]
ordered_estimates_desc <- estimates[order(-estimates)]
print(ordered_estimates_desc)
(Intercept) address sex goout Fjobservices
0.842180660 0.392068180 0.353346417 0.340353023 0.300140840
paid reasonother famsize guardianfather traveltime
0.195821395 0.170395744 0.148943530 0.133870884 0.129914894
higher health freetime reasonhome Fedu
0.082192675 0.067706007 0.065689148 0.056189569 0.055837905
internet Mjobother Fjobhealth guardianmother age
0.055174216 0.035265094 0.026263021 0.018862618 0.015411085
absences failures famsup Mjobat_home reasoncourse
0.009261926 -0.003078618 -0.012350996 -0.026316601 -0.039207470
Medu Mjobservices studytime Mjobhealth Fjobat_home
-0.046960243 -0.092508832 -0.104700495 -0.133108429 -0.137286283
romantic school schoolsup Pstatus famrel
-0.143140525 -0.156949701 -0.169464562 -0.197511036 -0.201110639
activities nursery
-0.203463393 -0.280194864
library(ggplot2)
# Convert ordered estimates to a data frame
df_estimates <- data.frame(Predictor = names(ordered_estimates_desc),
Estimate = ordered_estimates_desc)
# Plot using ggplot2
plot <- ggplot(df_estimates, aes(x = reorder(Predictor, Estimate), y = Estimate)) +
geom_bar(stat = "identity", fill = "lightblue") +
coord_flip() +
labs(title = "Ordered Estimates from the Model",
x = "Predictors",
y = "Coefficient Value") +
theme_minimal()
plot